## datawork.r
## Vito D'Orazio
## June 11, 2014
## This script is intended to demonstrate some fundamental R functions for high level data work.


# start from a clean slate: remove every object in the current environment,
# then point R at the workshop folder so relative file names resolve there
rm(list = ls())
setwd("/Users/vjdorazio/Desktop/IQSS/privacy_tools/R_workshop")
# confirm which directory R is now using
getwd()

# load the saved workspace image from the working directory; it is expected
# to provide the data frame `mydata` used throughout the rest of the script
load("PUMS5Extract.Rdata")
# view objects in your current environment
ls()

# write the data to a csv file in your working directory
write.table(mydata, "PUMS5Extract.csv", sep = ",", col.names = TRUE, row.names = FALSE)

# read the csv back in using its full path.  The file name must match the one
# written above exactly (including upper/lower case), because many file
# systems are case-sensitive
mydata <- read.table("/Users/vjdorazio/Desktop/IQSS/privacy_tools/R_workshop/PUMS5Extract.csv", header = TRUE, sep = ",")

# or alternatively, with a path relative to the working directory...
mydata2 <- read.csv("PUMS5Extract.csv")
# remove mydata2 from the environment
rm(mydata2)

# a first look at the data: leading rows, labels, and size
head(mydata, n = 6L)   # first six rows
names(mydata)          # column (variable) names
row.names(mydata)      # row labels
dim(mydata)            # c(number of rows, number of columns)
nrow(mydata)
ncol(mydata)
#fix(mydata) # data table viewer

## split data in half by row
# halfrow is the number of rows in the data divided by 2, rounded down
# a is the subset of mydata from row 1 to row halfrow
# b is the subset of mydata from row halfrow+1 to the last row
# seq_len() is used rather than 1:halfrow because 1:0 counts backwards
# (it gives c(1, 0)), which would select the wrong rows for very small data

halfrow <- nrow(mydata) %/% 2
a <- mydata[seq_len(halfrow), ]
b <- mydata[seq_len(nrow(mydata)) > halfrow, ]

## split data by column
# c gets the right-hand columns of b, and b keeps the left-hand columns.
# drop = FALSE keeps each piece a data frame even if it has only one column.
# NOTE: the object named `c` shadows the name of the base function c(), but
# calls such as c("X", "age") still work because R skips non-function objects
# when it looks up a function being called.
halfcol <- ncol(mydata) %/% 2
c <- b[, seq_len(ncol(b)) > halfcol, drop = FALSE]
b <- b[, seq_len(halfcol), drop = FALSE]

# the original is no longer needed; it is rebuilt from the pieces below
rm(mydata)

## carve b into two column subsets chosen by variable name, not position
# both subsets keep the key column "X" so they can be matched up again later
dcols <- c("X", "age")
bcols <- c("X", "state", "puma", "sex")
d <- b[dcols]
b <- b[bcols]

# shuffle the rows of d: attach a column of random normal draws, sort the
# rows by that column, then discard the helper column
d[["random"]] <- rnorm(nrow(d))
d <- d[order(d[["random"]]), ]
d[["random"]] <- NULL

## now reassemble mydata from a, b, c, and d
# make sure to always check your dimensions, classes, etc. as you work.  R is weakly typed and can change things without telling you

# join the two name-selected column subsets on their shared key "X"
e <- merge(x = b, y = d, by = "X")
dim(e)
colnames(e)
# put the rows in ascending order of X
e <- e[order(e$X), ]
class(e)
# cbind() pairs rows purely by position, so this step relies on the rows of c
# being in the same order as the X-sorted rows of e
# (assumes X was already ascending in the original data -- TODO confirm)
e <- cbind(e, c)
class(e)
# stack the untouched first half on top of the rebuilt second half
e <- rbind(a, e)
dim(e)

# reload the original data (same file as loaded at the top of the script)
load("PUMS5Extract.Rdata")
ls()
dim(mydata)
# comparison: positions where the rebuilt data differ from the original.
# integer(0), i.e. a zero-length result, means no differences were found
which(e != mydata)
# which() skips comparisons that evaluate to NA, so also check that missing
# values sit in the same cells; again integer(0) means the data agree
which(is.na(e) != is.na(mydata))





